Alignmnent and counting of the fastq files

This step is performed on the O2 cluster. The fastq file quality was checked using FastQC and MultiQC. They are aligned to Ensembl mm10 genome and counted using Salmon pseudoaligner. Output sf files were transfered from O2 to local machine for further processing in R.

Library loading and set up

suppressMessages(
  c(library(DESeq2),
    library(tximport),
    library(gridExtra),
    library(ensembldb),
    library(EnsDb.Mmusculus.v79),
    library(grid),
    library(ggplot2),
    library(lattice),
    library(reshape),
    library(mixOmics),
    library(gplots),
    library(RColorBrewer),
    library(readr),
    library(dplyr),
    library(VennDiagram),
    library(clusterProfiler),
    library(DOSE),
    library(org.Mm.eg.db), 
    library(pathview),
    library(AnnotationDbi),
    library(gtools))
)
## Warning: package 'ensembldb' was built under R version 3.5.3
## Warning: package 'GenomicFeatures' was built under R version 3.5.3
##   [1] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##   [4] "BiocParallel"         "matrixStats"          "Biobase"             
##   [7] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [10] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [13] "stats4"               "stats"                "graphics"            
##  [16] "grDevices"            "utils"                "datasets"            
##  [19] "methods"              "base"                 "tximport"            
##  [22] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##  [25] "BiocParallel"         "matrixStats"          "Biobase"             
##  [28] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [31] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [34] "stats4"               "stats"                "graphics"            
##  [37] "grDevices"            "utils"                "datasets"            
##  [40] "methods"              "base"                 "gridExtra"           
##  [43] "tximport"             "DESeq2"               "SummarizedExperiment"
##  [46] "DelayedArray"         "BiocParallel"         "matrixStats"         
##  [49] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
##  [52] "IRanges"              "S4Vectors"            "BiocGenerics"        
##  [55] "parallel"             "stats4"               "stats"               
##  [58] "graphics"             "grDevices"            "utils"               
##  [61] "datasets"             "methods"              "base"                
##  [64] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
##  [67] "AnnotationDbi"        "gridExtra"            "tximport"            
##  [70] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
##  [73] "BiocParallel"         "matrixStats"          "Biobase"             
##  [76] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [79] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [82] "stats4"               "stats"                "graphics"            
##  [85] "grDevices"            "utils"                "datasets"            
##  [88] "methods"              "base"                 "EnsDb.Mmusculus.v79" 
##  [91] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
##  [94] "AnnotationDbi"        "gridExtra"            "tximport"            
##  [97] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [100] "BiocParallel"         "matrixStats"          "Biobase"             
## [103] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [106] "S4Vectors"            "BiocGenerics"         "parallel"            
## [109] "stats4"               "stats"                "graphics"            
## [112] "grDevices"            "utils"                "datasets"            
## [115] "methods"              "base"                 "grid"                
## [118] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [121] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [124] "tximport"             "DESeq2"               "SummarizedExperiment"
## [127] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [130] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [133] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [136] "parallel"             "stats4"               "stats"               
## [139] "graphics"             "grDevices"            "utils"               
## [142] "datasets"             "methods"              "base"                
## [145] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [148] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [151] "AnnotationDbi"        "gridExtra"            "tximport"            
## [154] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [157] "BiocParallel"         "matrixStats"          "Biobase"             
## [160] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [163] "S4Vectors"            "BiocGenerics"         "parallel"            
## [166] "stats4"               "stats"                "graphics"            
## [169] "grDevices"            "utils"                "datasets"            
## [172] "methods"              "base"                 "lattice"             
## [175] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [178] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [181] "AnnotationDbi"        "gridExtra"            "tximport"            
## [184] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [187] "BiocParallel"         "matrixStats"          "Biobase"             
## [190] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [193] "S4Vectors"            "BiocGenerics"         "parallel"            
## [196] "stats4"               "stats"                "graphics"            
## [199] "grDevices"            "utils"                "datasets"            
## [202] "methods"              "base"                 "reshape"             
## [205] "lattice"              "ggplot2"              "grid"                
## [208] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [211] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [214] "tximport"             "DESeq2"               "SummarizedExperiment"
## [217] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [220] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [223] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [226] "parallel"             "stats4"               "stats"               
## [229] "graphics"             "grDevices"            "utils"               
## [232] "datasets"             "methods"              "base"                
## [235] "mixOmics"             "MASS"                 "reshape"             
## [238] "lattice"              "ggplot2"              "grid"                
## [241] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [244] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [247] "tximport"             "DESeq2"               "SummarizedExperiment"
## [250] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [253] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [256] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [259] "parallel"             "stats4"               "stats"               
## [262] "graphics"             "grDevices"            "utils"               
## [265] "datasets"             "methods"              "base"                
## [268] "gplots"               "mixOmics"             "MASS"                
## [271] "reshape"              "lattice"              "ggplot2"             
## [274] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [277] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [280] "gridExtra"            "tximport"             "DESeq2"              
## [283] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
## [286] "matrixStats"          "Biobase"              "GenomicRanges"       
## [289] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [292] "BiocGenerics"         "parallel"             "stats4"              
## [295] "stats"                "graphics"             "grDevices"           
## [298] "utils"                "datasets"             "methods"             
## [301] "base"                 "RColorBrewer"         "gplots"              
## [304] "mixOmics"             "MASS"                 "reshape"             
## [307] "lattice"              "ggplot2"              "grid"                
## [310] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [313] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [316] "tximport"             "DESeq2"               "SummarizedExperiment"
## [319] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [322] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [325] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [328] "parallel"             "stats4"               "stats"               
## [331] "graphics"             "grDevices"            "utils"               
## [334] "datasets"             "methods"              "base"                
## [337] "readr"                "RColorBrewer"         "gplots"              
## [340] "mixOmics"             "MASS"                 "reshape"             
## [343] "lattice"              "ggplot2"              "grid"                
## [346] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [349] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [352] "tximport"             "DESeq2"               "SummarizedExperiment"
## [355] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [358] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [361] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [364] "parallel"             "stats4"               "stats"               
## [367] "graphics"             "grDevices"            "utils"               
## [370] "datasets"             "methods"              "base"                
## [373] "dplyr"                "readr"                "RColorBrewer"        
## [376] "gplots"               "mixOmics"             "MASS"                
## [379] "reshape"              "lattice"              "ggplot2"             
## [382] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [385] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [388] "gridExtra"            "tximport"             "DESeq2"              
## [391] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
## [394] "matrixStats"          "Biobase"              "GenomicRanges"       
## [397] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [400] "BiocGenerics"         "parallel"             "stats4"              
## [403] "stats"                "graphics"             "grDevices"           
## [406] "utils"                "datasets"             "methods"             
## [409] "base"                 "VennDiagram"          "futile.logger"       
## [412] "dplyr"                "readr"                "RColorBrewer"        
## [415] "gplots"               "mixOmics"             "MASS"                
## [418] "reshape"              "lattice"              "ggplot2"             
## [421] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [424] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [427] "gridExtra"            "tximport"             "DESeq2"              
## [430] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
## [433] "matrixStats"          "Biobase"              "GenomicRanges"       
## [436] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [439] "BiocGenerics"         "parallel"             "stats4"              
## [442] "stats"                "graphics"             "grDevices"           
## [445] "utils"                "datasets"             "methods"             
## [448] "base"                 "clusterProfiler"      "VennDiagram"         
## [451] "futile.logger"        "dplyr"                "readr"               
## [454] "RColorBrewer"         "gplots"               "mixOmics"            
## [457] "MASS"                 "reshape"              "lattice"             
## [460] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [463] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [466] "AnnotationDbi"        "gridExtra"            "tximport"            
## [469] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [472] "BiocParallel"         "matrixStats"          "Biobase"             
## [475] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [478] "S4Vectors"            "BiocGenerics"         "parallel"            
## [481] "stats4"               "stats"                "graphics"            
## [484] "grDevices"            "utils"                "datasets"            
## [487] "methods"              "base"                 "DOSE"                
## [490] "clusterProfiler"      "VennDiagram"          "futile.logger"       
## [493] "dplyr"                "readr"                "RColorBrewer"        
## [496] "gplots"               "mixOmics"             "MASS"                
## [499] "reshape"              "lattice"              "ggplot2"             
## [502] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [505] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [508] "gridExtra"            "tximport"             "DESeq2"              
## [511] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
## [514] "matrixStats"          "Biobase"              "GenomicRanges"       
## [517] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [520] "BiocGenerics"         "parallel"             "stats4"              
## [523] "stats"                "graphics"             "grDevices"           
## [526] "utils"                "datasets"             "methods"             
## [529] "base"                 "org.Mm.eg.db"         "DOSE"                
## [532] "clusterProfiler"      "VennDiagram"          "futile.logger"       
## [535] "dplyr"                "readr"                "RColorBrewer"        
## [538] "gplots"               "mixOmics"             "MASS"                
## [541] "reshape"              "lattice"              "ggplot2"             
## [544] "grid"                 "EnsDb.Mmusculus.v79"  "ensembldb"           
## [547] "AnnotationFilter"     "GenomicFeatures"      "AnnotationDbi"       
## [550] "gridExtra"            "tximport"             "DESeq2"              
## [553] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
## [556] "matrixStats"          "Biobase"              "GenomicRanges"       
## [559] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [562] "BiocGenerics"         "parallel"             "stats4"              
## [565] "stats"                "graphics"             "grDevices"           
## [568] "utils"                "datasets"             "methods"             
## [571] "base"                 "pathview"             "org.Hs.eg.db"        
## [574] "org.Mm.eg.db"         "DOSE"                 "clusterProfiler"     
## [577] "VennDiagram"          "futile.logger"        "dplyr"               
## [580] "readr"                "RColorBrewer"         "gplots"              
## [583] "mixOmics"             "MASS"                 "reshape"             
## [586] "lattice"              "ggplot2"              "grid"                
## [589] "EnsDb.Mmusculus.v79"  "ensembldb"            "AnnotationFilter"    
## [592] "GenomicFeatures"      "AnnotationDbi"        "gridExtra"           
## [595] "tximport"             "DESeq2"               "SummarizedExperiment"
## [598] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [601] "Biobase"              "GenomicRanges"        "GenomeInfoDb"        
## [604] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [607] "parallel"             "stats4"               "stats"               
## [610] "graphics"             "grDevices"            "utils"               
## [613] "datasets"             "methods"              "base"                
## [616] "pathview"             "org.Hs.eg.db"         "org.Mm.eg.db"        
## [619] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [622] "futile.logger"        "dplyr"                "readr"               
## [625] "RColorBrewer"         "gplots"               "mixOmics"            
## [628] "MASS"                 "reshape"              "lattice"             
## [631] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [634] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [637] "AnnotationDbi"        "gridExtra"            "tximport"            
## [640] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [643] "BiocParallel"         "matrixStats"          "Biobase"             
## [646] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [649] "S4Vectors"            "BiocGenerics"         "parallel"            
## [652] "stats4"               "stats"                "graphics"            
## [655] "grDevices"            "utils"                "datasets"            
## [658] "methods"              "base"                 "gtools"              
## [661] "pathview"             "org.Hs.eg.db"         "org.Mm.eg.db"        
## [664] "DOSE"                 "clusterProfiler"      "VennDiagram"         
## [667] "futile.logger"        "dplyr"                "readr"               
## [670] "RColorBrewer"         "gplots"               "mixOmics"            
## [673] "MASS"                 "reshape"              "lattice"             
## [676] "ggplot2"              "grid"                 "EnsDb.Mmusculus.v79" 
## [679] "ensembldb"            "AnnotationFilter"     "GenomicFeatures"     
## [682] "AnnotationDbi"        "gridExtra"            "tximport"            
## [685] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [688] "BiocParallel"         "matrixStats"          "Biobase"             
## [691] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [694] "S4Vectors"            "BiocGenerics"         "parallel"            
## [697] "stats4"               "stats"                "graphics"            
## [700] "grDevices"            "utils"                "datasets"            
## [703] "methods"              "base"

Compile gene count files in DESeq2

Set working directory to the folder that contains only gene count txt files

# Generate a tx2gene object for matrix generation
edb <- EnsDb.Mmusculus.v79 
transcriptsID <- as.data.frame(transcripts(edb))
tx2gene <- as.data.frame(cbind(transcriptsID$tx_id, transcriptsID$gene_id))

# Generate DESeqData using the counting result generated using Salmon
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Gene Counts")
inDir = getwd()
countFiles = list.files(inDir, pattern=".sf$", full.names = TRUE)
basename(countFiles)
## [1] "LIB037245_GEN00137835.sf" "LIB037245_GEN00137836.sf"
## [3] "LIB037245_GEN00137837.sf" "LIB037245_GEN00137838.sf"
## [5] "LIB037245_GEN00137839.sf" "LIB037245_GEN00137840.sf"
## [7] "LIB037245_GEN00137841.sf" "LIB037245_GEN00137842.sf"
names(countFiles) <- c('Fabp_1','Fabp_2','Fabp_3','Fabp_4','KrasG12D_1','KrasG12D_2','KrasG12D_3','KrasG12D_4')

txi.salmon <- tximport(countFiles, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 
## transcripts missing from tx2gene: 27668
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- data.frame(condition = c('control','control','control','control','experimental','experimental','experimental','experimental'))

DESeqsampletable$gender <- factor(c("F", "M", "M", "M", "F", "F", "F", "M"))
rownames(DESeqsampletable) <- colnames(txi.salmon$counts)

ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~  gender + condition)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- estimateSizeFactors(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
ddsHTSeq_norm <- DESeq(ddsHTSeq_norm)
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## additional priors are available via the 'type' argument, see ?lfcShrink for details

MA plot was generated to inspect the correct shrinkage of LFC.

DESeq2::plotMA(ddsHTSeq_analysis)

Quality Inspection of the Gene Count Data

Generate raw count table that contains the simple counts of each gene

Data is transformed and pseudocount is calculated.

rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalize = TRUE))
pseudoCount = log2(rawCountTable + 1)
grid.arrange(
  ggplot(pseudoCount, aes(x= Fabp_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "Fabp_1"), 
  ggplot(pseudoCount, aes(x= Fabp_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "Fabp_2"),
  ggplot(pseudoCount, aes(x= Fabp_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "Fabp_3"),
  ggplot(pseudoCount, aes(x= Fabp_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "Fabp_4"), nrow = 2)

grid.arrange(
  ggplot(pseudoCount, aes(x= KrasG12D_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_1"),
  ggplot(pseudoCount, aes(x= KrasG12D_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_2"),
  ggplot(pseudoCount, aes(x= KrasG12D_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_3"),
  ggplot(pseudoCount, aes(x= KrasG12D_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") + 
    geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_4"), nrow = 2)

Between-sample distribution

Check on the gene count distribution across all genes.

#Boxplots
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
df <- data.frame(df, Condition = substr(df$Samples,1,4))

ggplot(df, aes(x=Samples, y=value, fill = Condition)) + geom_boxplot() + xlab("") + 
  ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))

#Histograms and density plots
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + ylim(c(0, 0.25)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab(expression(log[2](count+1)))

### Generate MA plots MA plots are used to check for imbalance in sequencing depth between samples of the same condition. I did not generate MA plot for all library pairs. But the example pairs I selected show that there are imbalance in sequencing depth, but the imbalance is quite random and this is common in RNA-Seq datasets.

## WT1 vs WT2
x = pseudoCount[, 1]
y = pseudoCount[, 2]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)

suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
  geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "Fabp_1 vs Fabp_2"))

## WT1 vs WT3
x = pseudoCount[, 1]
y = pseudoCount[, 3]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)

suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
  geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "Fabp_1 vs Fabp_3"))

## G12D_1 vs G12D_2
x = pseudoCount[, 5]
y = pseudoCount[, 6]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)

suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
  geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasG12D_1 vs KrasG12D_2"))

## G12D_1 vs G12D_3
x = pseudoCount[, 5]
y = pseudoCount[, 7]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)

suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
  geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasG12D_1 vs KrasG12D_3"))

Clustering of the sample-to-sample distances

This is the sanity check step to confirm that under a un-supervised clustering, WT and G12D samples will cluster together. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf

ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq_norm)
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

Principal component plot of the samples

I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.

The top 500 most variable genes are selected for PCA analysis.

plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500)

Raw data filtering and Generate the raw count file with all detected genes

This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.

rawCountTable_no_normalization <- as.data.frame(DESeq2::counts(ddsHTSeq))
keep <- rowMeans(rawCountTable[,1:4]) > 50 | rowMeans(rawCountTable[,5:8]) > 50
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using  as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts.csv")
rawCountTable_no_normalization <- rawCountTable_no_normalization[keep,]
write.csv(rawCountTable_no_normalization, "raw_gene_counts.csv")


ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + 
  geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
  theme(legend.position = "top") + xlab("pseudocounts")

Output txt file for CiberSort

# annotate gene count list with gene name and write out as txt file for CiberSort
# Return the Ensembl IDs for a set of genes
annotations_cs <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
                                           keys = rownames(detected_raw_count_norm),
                                           columns = c("GENENAME"),
                                           keytype = "GENEID")

# Determine the indices for the non-duplicated genes
non_duplicates_cs <- which(duplicated(annotations_cs$GENENAME) == FALSE)

# Return only the non-duplicated genes using indices
annotations_cs <- annotations_cs[non_duplicates_cs, ]

# Check number of NAs returned
is.na(annotations_cs$GENENAME) %>%
  which() %>%
  length()
## [1] 0
# join the expression data with gene names
detected_raw_count_norm_cs <- as_tibble(detected_raw_count_norm, rownames = "ENSEMBL_ID")
detected_raw_count_norm_cs <- inner_join(detected_raw_count_norm_cs,annotations_cs, by=c("ENSEMBL_ID"="GENEID"))

detected_raw_count_norm_cs[,1] <- detected_raw_count_norm_cs[,dim(detected_raw_count_norm_cs)[2]]
detected_raw_count_norm_cs <- detected_raw_count_norm_cs[,-dim(detected_raw_count_norm_cs)[2]]
colnames(detected_raw_count_norm_cs)[1] <- "GENEID"

# convert the gene names to all capitalized letters for use in Cibersort
detected_raw_count_norm_cs$GENEID <- toupper(detected_raw_count_norm_cs$GENEID)

# I need to filter the list for genes that exist in the signature gene list
# load the LM22 signature gene list
LM22_signature <- read_csv("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/CiberSort/LM22_signature.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Gene symbol` = col_character()
## )
## See spec(...) for full column specifications.
LM22_signature <- LM22_signature[,1]
colnames(LM22_signature)[1] <- "GENEID"
detected_raw_count_norm_cs <- inner_join(detected_raw_count_norm_cs,LM22_signature, by = c("GENEID" = "GENEID"))

setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/CiberSort")
write.csv(detected_raw_count_norm_cs, "normalized_raw_gene_counts_for_Cibersort.csv")

Once the result file is generated and exported as csv file, I input it here and perform differential analysis on them.

cs_result <- read_csv("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/CiberSort/CIBERSORT.Output_Job2.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `Input Sample` = col_character()
## )
## See spec(...) for full column specifications.
cs_result <- as.data.frame(cs_result)
rownames(cs_result) <- cs_result[,1]
cs_result <- cs_result[,-1]
cs_result <- t(cs_result)
cs_result <- cs_result[-c(23,24,25),]

# filter out cells that had 0 count
keep_cs <- rowSums(cs_result) > 0

# calculate the stats for each cell type with values
cs_result <- cs_result[keep_cs,]

# Calculate the pvalue using parametric unpaired t test
p_value_list <- c()
for (i in 1:dim(cs_result)[1]) {
  p_value <- t.test(unlist(cs_result[i,5:8]), unlist(cs_result[i,1:4]), paired = FALSE)$p.value
  p_value_list <- c(p_value_list, p_value)
}
cs_result <- cbind(cs_result, p_value_list)
colnames(cs_result)[9] <- "p_values" 

# calculate the q value using Benjamini Hochberg FDR correction
q_value_list <- p.adjust(cs_result[,9], method = "BH")
cs_result <- cbind(cs_result, q_value_list)
colnames(cs_result)[10] <- "q_values"

# calculate fold change and log fold change
foldchange_list <- c()
for (i in 1:dim(cs_result)[1]) {
  foldchange <- foldchange(mean(unlist(cs_result[i,5:8])), mean(unlist(cs_result[i,1:4])))
  foldchange_list <- c(foldchange_list, foldchange)
}
logfoldchange_list <- foldchange2logratio(foldchange_list)
cs_result <- cbind(cs_result, foldchange_list, logfoldchange_list)
colnames(cs_result)[11:12] <- c("foldChange", "LFC")

# output the analysis file
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/CiberSort")
write.csv(cs_result, "CiberSort_de_analysis.csv")

Generate file with differential analysis result

This step generates the analysis file that contains results from differential analysis.

write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis.csv")

Draw heatmap for transcripts that are significantly dysregulated in KRasG12D samples

Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.

suppressMessages(library(mosaic))

rawCountTable_transform_detected <- rawCountTable_transform[keep,]

dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
  sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
  sig_dif[i,7:14] <- zscore(as.numeric(sig_dif[i,7:14]))
}

my_palette <- colorRampPalette(c("red", "white", "blue"))(256)
heatmap_matrix <- as.matrix(sig_dif[,7:14])

png('G12D vs WT colon epithelium RNASeq.png',
    width = 600,
    height = 1400,
    res = 200,
    pointsize = 8)
par(cex.main=1.1)
heatmap.2(heatmap_matrix,
          main = "Differentially expressed\nRNA in colon epithelium\npadj < 0.05",
          density.info = "none",
          key = TRUE,
          lwid = c(3,7),
          lhei = c(1,7),
          col=my_palette,
          margins = c(12,2),
          symbreaks = TRUE,
          trace = "none",
          dendrogram = "row",
          labRow = FALSE,
          ylab = "Genes",
          cexCol = 2,
          Colv = "NA")
dev.off()
## quartz_off_screen 
##                 2

Final output is Heatmap for differential genes

Scatter plot, MA plot and Volcano plot for data visualization

# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$KrasG12D_mean <- rowMeans(detected_pseudocount[,5:8])
dif_analysis$KrasWT_mean <- rowMeans(detected_pseudocount[,1:4])
ggplot(dif_analysis, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
  xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
  labs(title = "WT vs G12D Scatter Plot")

# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
  xlab("Average Expression") + ylab("LFC") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "WT vs G12D MA Plot")

# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(padj,10))) +
  xlab("LFC") + ylab("-log10(P value)") + 
  geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
  geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "WT vs G12D Volcano Plot") +
  xlim(-3,3)
## Warning: Removed 33 rows containing missing values (geom_point).
## Warning: Removed 14 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).

GO analysis for DE genes

Classic GO analysis is performed here for all DE genes detected in this dataset. The reference list is list of genes detected in RNASeq. Three categories of GO terms are tested here, including biological process, molecular function and cellular component.

target_gene <- as.character(rownames(sig_dif))
detected_gene <- as.character(rownames(detected_pseudocount))

# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "BP", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)

write.csv(cluster_summary_BP, "GO/GO analysis_BP.csv")

# Run GO enrichment analysis for molecular function 
ego_MF <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "MF", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)

write.csv(cluster_summary_MF, "GO/GO analysis_MF.csv")

# Run GO enrichment analysis for cellular component 
ego_CC <- enrichGO(gene = target_gene, 
                universe = detected_gene,
                keyType = "ENSEMBL",
                OrgDb = org.Mm.eg.db, 
                ont = "CC", 
                pAdjustMethod = "BH", 
                pvalueCutoff = 0.05, 
                readable = TRUE)

# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)

write.csv(cluster_summary_CC, "GO/GO analysis_CC.csv")

Draw Dotplot representing the results

Biological process
png('GO/GO dotplot_BP.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_dotplot

Molecular function
png('GO/GO dotplot_MF.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_dotplot

Cellular component
png('GO/GO dotplot_CC.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_dotplot

Draw enrichment Go plot representing the results

Biological process
png('GO/GO enrichment_BP.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_enrichplot

Molecular function
png('GO/GO enrichment_MF.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_enrichplot

Cellular component
png('GO/GO enrichment_CC.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_enrichplot

Draw category netplot representing the results

The category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color).

Biological process
OE_foldchanges <- sig_dif$log2FoldChange
names(OE_foldchanges) <- rownames(sig_dif)

png('GO/GO cnetplot_BP.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_BP, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: BP_cnetplot

Molecular function
png('GO/GO cnetplot_MF.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_MF, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: MF_cnetplot

Cellular component
png('GO/GO cnetplot_CC.png',
    width = 1600,
    height = 1600,
    res = 100,
    pointsize = 8)
cnetplot(ego_CC, 
         categorySize="pvalue", 
         showCategory = 5, 
         foldChange=OE_foldchanges, 
         vertex.label.font=6)
dev.off()
## quartz_off_screen 
##                 2

Final output is following: CC_cnetplot

Overlaping tumor transcriptomics and proteomics

For all proteins that were detected in the scraped colon proteomics, I extracted their Uniprot ID and matched them with Ensembl Gene Stable ID using BioMart on Ensembl. The information is stored in Uniprot_2_Ensembl.csv. I am also using the EnsDb.Mmusculus.v79 package to match Ensembl ID to Uniprot ID. Not sure which one gives me more mathces.

The proteomics file that I am using here is generated by Joao, tissue collect done by Emily and differential analysis done by Doug. The raw quantification file is 2015-03_HaigisMouseColon8plex_Prot. The differential analysis file is

Uniprot_2_Ensembl <- read_csv("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/Uniprot_2_Ensembl.csv")
## Parsed with column specification:
## cols(
##   `Gene stable ID` = col_character(),
##   `UniProtKB/Swiss-Prot ID` = col_character()
## )
proteomics_quant <- read_csv("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon/2015-03_HaigisMouseColon8plex_Prot.csv")
## Warning: Missing column names filled in: 'X2' [2]
## Parsed with column specification:
## cols(
##   `Protein Id` = col_character(),
##   X2 = col_character(),
##   `Gene Symbol` = col_character(),
##   Description = col_character(),
##   FABP_1 = col_double(),
##   FABP_2 = col_double(),
##   FABP_4 = col_double(),
##   FABP_5 = col_double(),
##   G12D_1 = col_double(),
##   G12D_2 = col_double(),
##   G12D_3 = col_double(),
##   G12D_4 = col_double()
## )
proteomics_diff <- read_csv("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon/ceMS_diff.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   `Protein Id` = col_character(),
##   X2 = col_character(),
##   `Gene Symbol` = col_character(),
##   p_values = col_double(),
##   q_values = col_double(),
##   foldChange = col_double(),
##   LFC = col_double()
## )

This is to annotate the inputs in proteomics quantification file with their corresponding Ensembl ID.

Now I try to match Uniprot to Ensembl ID using EnsDb.Mmusculus.v79 package

# Return the Ensembl IDs for a set of genes
annotations_edb <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
                                           keys = proteomics_quant$`Protein Id`,
                                           columns = c("GENEID", "GENENAME"),
                                           keytype = "UNIPROTID")

# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_edb$UNIPROTID) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_edb$GENEID) == FALSE)

# Return only the non-duplicated genes using indices
annotations_edb <- annotations_edb[non_duplicates_idx, ]

# Check number of NAs returned
is.na(annotations_edb$GENEID) %>%
  which() %>%
  length()
## [1] 0

Compare BioMart ID conversion output with ID mathcing output from the EnsDb.Mmusculus.v79 package.

# number of matches using EnsDb.Mmusculus.v79
dim(annotations_edb)[1]
## [1] 7665
# number of matches using BioMart
dim(Uniprot_2_Ensembl)[1]
## [1] 6892

It seems that matching using the EnsDb.Mmusculus.v79 and AnnotationDbipackage is superior. I will use this method for ID conversion in the future.

I need to add the Ensembl ID to the proteomics quantification dataframe.

proteomics_quant <- as_tibble(proteomics_quant)
proteomics_diff <- as_tibble(proteomics_diff)
dif_analysis$GeneID <- rownames(dif_analysis)
dif_analysis_tib <- as_tibble(dif_analysis)
proteomics_quant <- inner_join(proteomics_quant, annotations_edb, by=c("Protein Id"="UNIPROTID"))
proteomics_diff <- inner_join(proteomics_diff, annotations_edb, by=c("Protein Id"="UNIPROTID"))

Then I overlap the Stable Gene IDs in Proteomics with the ones in detected in transcriptomics to find outputs that were detected in both large datasets. For the overlapped proteins, the log fold change for their transcript expression and protein expression were extracted for testing their correlation.

overlap <- intersect(rownames(detected_raw_count_norm), proteomics_quant$GENEID)
overlap_lfc <- inner_join(dif_analysis_tib[,c(2,9)], proteomics_diff[,c(8,9)], by=c("GeneID"="GENEID"))
colnames(overlap_lfc) <- c('rna_lfc', 'GeneID', 'protein_lfc')
overlap_lfc$rna_lfc <- as.numeric(as.character(overlap_lfc$rna_lfc))
overlap_lfc$protein_lfc <- as.numeric(as.character(overlap_lfc$protein_lfc))

# Plot scatterplot and calculate the Pearson Correlation
ggplot(overlap_lfc, aes(x = rna_lfc, y = protein_lfc)) +
  geom_point(data = overlap_lfc, alpha = 0.5, size = 1, color = "black") +
  xlab("RNA LFC (KRas/WT)") + ylab("Protein LFC (KRas/WT)") + 
  labs(title = "Correlation between proteomics and transcriptomics") +
  xlim(-3,3) + ylim(-3,3)
## Warning: Removed 7 rows containing missing values (geom_point).

# Correlation test
cor.test(overlap_lfc$rna_lfc, overlap_lfc$protein_lfc)
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 36.737, df = 6853, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3856570 0.4252171
## sample estimates:
##      cor 
## 0.405627

Zooming into genes/proteins that are dysregulated in transcriptomics and proteomics

This analysis is performed only on proteins that are detected in both datasets.

overlap_p <- inner_join(dif_analysis_tib[,c(6,9)], proteomics_diff[,c(5,6,9)], by=c("GeneID"="GENEID"))
colnames(overlap_p) <- c('rna_p', 'GeneID', 'protein_p', 'protein_q')
overlap_p$rna_p <- as.numeric(as.character(overlap_p$rna_p))
overlap_p$protein_p <- as.numeric(as.character(overlap_p$protein_p))
overlap_p$protein_q <- as.numeric(as.character(overlap_p$protein_q))

Making a Venn Diagram to show the overlap between DE genes and DE proteins

rna_de <- subset(overlap_p, overlap_p$rna_p < 0.05)$GeneID
protein_de <- subset(overlap_p, overlap_p$protein_p < 0.05 & overlap_p$protein_q < 0.1)$GeneID
overlap_de <- intersect(rna_de, protein_de)
grid.newpage()
draw.pairwise.venn(length(rna_de),
                   length(protein_de),
                   length(overlap_de),
                   catergory <- c("DE_Transcriptomics",
                                  "DE_Proteomics"),
                   lty = "blank",
                   ex.text = FALSE,
                   fill = c("pink", "lightblue"),
                   cat.pos = c(200, 135), cat.dist = 0.05, margin = 0.05,
                   fontfamily = "sans", cat.fontfamily = "sans")

## (polygon[GRID.polygon.3451], polygon[GRID.polygon.3452], polygon[GRID.polygon.3453], polygon[GRID.polygon.3454], text[GRID.text.3455], text[GRID.text.3456], text[GRID.text.3457], text[GRID.text.3458], text[GRID.text.3459])